##############
#Automatically set working directory and library paths
##############

# Get command line arguments
args <- commandArgs(trailingOnly = FALSE)

# Identify script file path from args
file_arg_index <- grep("--file=", args)
if (length(file_arg_index) > 0) {
  script_path <- sub("--file=", "", args[file_arg_index])  # Extract script path
  script_dir <- dirname(script_path)  # Get script directory
  
  # Set working directory to the parent directory of the script
  setwd(dirname(script_dir))
  
  # Define custom library path relative to the working directory
  custom_lib_path <- file.path(getwd(), "scripts/libraries/R")
  
  # Add to library paths
  .libPaths(c(custom_lib_path, .libPaths()))
}

##############
# Check if correct working directory and R version are loaded
##############
cat("Verify correct R version is loaded", as.character(getRversion()), "\n")
cat("Verify correct working directory:", getwd(), "\n")

##############
# Load libraries
##############
library(tidyverse)
library(foreign)
library(GPArotation)
library(haven)
library(dplyr)
library(plyr, include.only = "rbind.fill")
library(pracma)

##############
# Import required data sets
##############

#Wave 1 in home data
home_w1 <-    read.xport(file.path("rawdata", "add health", "Core Files - Wave I",   "Wave I In Home Interview Data", "allwave1.xpt"))
#Wave 2 in home data
home_w2 <-    read.xport(file.path("rawdata", "add health", "Core Files - Wave II",  "Wave II In Home Interview Data", "wave2.xpt"))
#Wave 3 in home data
home_w3 <-    read.xport(file.path("rawdata", "add health", "Core Files - Wave III", "Wave III In Home Interview Data", "wave3", "wave3.xpt"))
ahpvt_w3 <-   read.xport(file.path("rawdata", "add health", "Core Files - Wave III", "Wave III In Home Interview Data", "ahpvt3", "ahpvt3.xpt"))
#Wave 4 in home data
home_w4 <-    read.xport(file.path("rawdata", "add health", "Core Files - Wave IV", "Wave IV In Home Interview Data", "wave4", "wave4.xpt"))
sect18_w4 <-  read.xport(file.path("rawdata", "add health", "Core Files - Wave IV", "Wave IV In Home Interview Data", "wave4a", "sect18.xpt"))
sect19_w4 <-  read.xport(file.path("rawdata", "add health", "Core Files - Wave IV", "Wave IV In Home Interview Data", "wave4a", "sect19.xpt"))
#Wave 5 in home data
home_w5 <-    read.xport(file.path("rawdata", "add health", "Core Files - Wave V", "Wave V Mixed-Mode Survey Data", "wave5.xpt"))
#Genetic data
gen_pgs <-   read.xport(file.path("rawdata", "add health", "Genetic Files", "Wave IV Polygenic Scores - Release 2", "pgs2.xpt"))
#Contextual files
w5group <-    read.xport(file.path("rawdata", "add health", "Contextual Files", "Wave I, II, III, IV & V Grouping Data", "W1_5GRP.xpt"))
context1 <-   read.xport(file.path("rawdata", "add health", "Contextual Files", "Wave I Contextual Data", "Context1.xpt"))
w3group <-    read.xport(file.path("rawdata", "add health", "Contextual Files", "Wave III Grouping File Data", "w3group.xpt"))
w4group <-    read.xport(file.path("rawdata", "add health", "Contextual Files", "Wave IV Grouping File Data", "w4group.xpt"))
w3region <-   read.xport(file.path("rawdata", "add health", "Contextual Files", "Wave III Region", "w3region.xpt"))
w4region <-   read.xport(file.path("rawdata", "add health", "Contextual Files", "Wave IV Region", "w4region.xpt"))
#Sibling data
sibling3 <-   read.xport(file.path("rawdata", "add health", "Sibling Files", "Wave III Sibling IDs", "sibling3.xpt"))
#Weights
w1homewc <-   read.xport(file.path("rawdata", "add health", "Additional Weight Files", "Wave I In-Home Weight Components", "w1homewc.xpt"))
w2homewc <-   read.xport(file.path("rawdata", "add health", "Additional Weight Files", "Wave II In-Home Weight Components", "w2homewc.xpt"))
w3homewc <-   read.xport(file.path("rawdata", "add health", "Additional Weight Files", "Wave III In-Home Weight Components", "w3homewc.xpt"))
w4homewc <-   read.xport(file.path("rawdata", "add health", "Additional Weight Files", "Wave IV In-Home Weight Components", "w4homewc.xpt"))
weights5 <-   read.xport(file.path("rawdata", "add health", "Core Files - Wave V", "Wave V Mixed-Mode Survey Weights", "weights5.xpt"))
schwt    <-   read.xport(file.path("rawdata", "add health", "Additional Weight Files", "Add Health School Weights", "schwt.xpt"))
#Other constructed variables
conses <-     read.xport(file.path("rawdata", "add health", "Constructed Files", "Constructed SES Variables", "conses.xpt"))
w4constr <-   read.xport(file.path("rawdata", "add health", "Constructed Files", "Wave IV Constructed Variable", "w4vars.xpt"))

##############
# In-home QRE W1
##############
home1 <- home_w1 %>%
  select(AID, SCID, BIO_SEX, H1RE1, H1ED11, H1ED12, H1ED14, H1GI20, AH_PVT, H1GI9) %>%
  rename(aid = AID,
         scid = SCID,
         gender = BIO_SEX,
         rel = H1RE1,
         race_interviewer=H1GI9,
         grade_eng=H1ED11,
         grade_mat=H1ED12,
         grade_sci=H1ED14,
         pvt_w1=AH_PVT,
         grade_w1=H1GI20) %>%
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid),
         scid = as.character(scid)) %>% 
  mutate(scid = replace(scid, scid == "999", NA),
         gender = replace(gender, gender == 6, NaN),
         gender = replace(gender, gender == 8, NaN),
         gender = replace(gender, gender == 1,0),
         gender = replace(gender, gender == 2,1),
         grade_eng = replace(-grade_eng, grade_eng > 4, NaN),
         grade_mat = replace(-grade_mat, grade_mat > 4, NaN),
         grade_sci = replace(-grade_sci, grade_sci > 4, NaN),
         grade_w1 = replace(grade_w1, grade_w1 > 12, NaN),
         race_interviewer = replace(race_interviewer, race_interviewer > 6, NaN),
         rel = case_when(rel == 0 ~ 0,
                         rel %in% c(1:19, 22) ~ 1,
                         rel %in% c(21, 24) ~ 0,
                         rel == 25 ~ 0,
                         rel == 26 ~ 0,
                         rel %in% c(20, 23, 27, 28) ~ 0)) %>%
  select(aid, scid, gender, rel, race_interviewer, grade_eng, grade_mat, grade_sci, pvt_w1, grade_w1)

##############
# Race/Ethnicity
##############
home_w1_temp <- home_w1 %>%
  select(AID, H1GI6A, H1GI6B, H1GI4, H1GI8) %>%
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid)) %>% 
  mutate(reth_aid1 = case_when(h1gi4==1 ~ 3, # hispanic
                               h1gi6b==2 & h1gi6a!=1 ~ 2, # black, non-hispanic
                               h1gi6a==1 & h1gi6b!=2 ~ 1, # white, non-hispanic
                               h1gi6b==2 & h1gi6a==1 & h1gi8==2 ~ 2,
                               h1gi6a==1 & h1gi6b==2 & h1gi8==1 ~ 1)) %>%
  select(aid, reth_aid1)

home_w3_temp <- home_w3 %>% 
  select(AID, H3OD2, H3OD4A, H3OD4B, H3OD6) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid)) %>%
  mutate(reth_aid3 = case_when(h3od2==1 ~ 3, # hispanic
                               h3od4b==1 & h3od4a!=1 ~ 2, # black, non-hispanic
                               h3od4a==1 & h3od4b!=1 ~ 1, # white, non-hispanic
                               h3od4b==1 & h3od4a==1 & h3od6==2 ~ 2,
                               h3od4a==1 & h3od4b==1 & h3od6==1 ~ 1)) %>% 
  select(aid, reth_aid3)

race <- home_w1_temp %>% 
  full_join(home_w3_temp, by = "aid") %>%
  mutate(reth_aid = case_when(!(is.na(reth_aid1)) ~ reth_aid1,
                              !(is.na(reth_aid3)) ~ reth_aid3)) %>%
  select(aid, reth_aid)

rm(home_w1_temp, home_w3_temp)


##############
# Parental SES
##############
belsky <- conses %>%
  mutate(AID = as.character(AID)) %>%
  rename(sesbelsky = SESPC_AL) %>%
  rename_all(tolower) %>% 
  select(aid, sesbelsky)

ses <- home_w1 %>%
  select(AID, starts_with("PA57"), PA59, PA55, PA56) %>%
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid)) %>%
  mutate_at(vars(starts_with("pa57")), ~ replace(., . == 6, NA)) %>% 
  mutate_at(vars(starts_with("pa57")), ~ scale(., center = TRUE, scale = TRUE)) %>% 
  mutate(inc = replace(pa55, pa55 == 9996, NA)) %>% 
  rowwise() %>% 
  mutate(welfare = ifelse(all(is.na(c(pa57a, pa57b, pa57c, pa57d, pa57e, pa57f))), NA, 
                          sum(pa57a, pa57b, pa57c, pa57d, pa57e, pa57f, na.rm = TRUE))) %>% 
  ungroup() %>% 
  mutate(medcare = addNA(factor(pa59,
                                levels = c(4, 3, 2, 1),
                                labels = c("very hard", "somewhat hard", "somewhat easy", "very easy")),
                         ifany = TRUE),
         bills = addNA(factor(pa56,
                              levels = c(0, 1),
                              labels = c("not enough", "enough")),
                       ifany = TRUE),
         inc_ln = log(inc + 1)) %>% 
  select(aid, welfare, medcare, bills, inc_ln)

ses<-ses%>%
  full_join(belsky, by = "aid")

rm(belsky)

##############
# Health
##############
home_w1_temp <- home_w1 %>% 
  select(AID, H1GH1) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w2_temp <- home_w2 %>% 
  select(AID, H2GH1) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w3_temp <- home_w3 %>% 
  select(AID, H3GH1) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w4_temp <- home_w4 %>% 
  select(AID, H4GH1) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

constr_w4_temp <- w4constr %>% 
  select(AID, C4VAR034, C4VAR045, C4VAR047) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

#Transformation into QALY according to van Dorslaer and Jones (2003)
health <- home_w1_temp %>% 
  full_join(home_w2_temp, by = "aid") %>% 
  full_join(home_w3_temp, by = "aid") %>% 
  full_join(home_w4_temp, by = "aid") %>% 
  full_join(constr_w4_temp, by = "aid") %>% 
  mutate_at(vars(h1gh1, h2gh1, h3gh1, h4gh1), ~recode(., 
                                                      `5`=0.401, 
                                                      `4`=0.707,
                                                      `3`=0.841,
                                                      `2`=0.9311,
                                                      `1`=0.9833,
                                                      `6`=NaN,
                                                      `8`=NaN,
                                                      `96`=NaN,
                                                      `98`=NaN)) %>%
  mutate(health_sub1 = h1gh1,
         health_sub2 = h2gh1,
         health_sub3 = h3gh1,
         health_sub4 = h4gh1,
         health_sub = (health_sub3+health_sub4)/2) %>%
  mutate_at(vars(c4var034,c4var045,c4var047), ~recode(., 
                                                      `8`=NaN)) %>%
  mutate(obese        = c4var034,
         hypertension = c4var045,
         cholesterol =  c4var047) %>%
  mutate_at(vars(starts_with("c4var")), ~ scale(., center = TRUE, scale = TRUE)) %>% 
  rowwise() %>%
  mutate(health_obj = ifelse(all(is.na(c(c4var034,c4var045,c4var047))), NaN, 
                             sum(-c4var034,-c4var045,-c4var047, na.rm = TRUE))) %>%
  select(aid, health_sub, health_sub1, health_sub2,health_sub3, health_sub4,obese,hypertension,cholesterol,c4var034, c4var045, c4var047,health_obj)

rm(home_w1_temp,home_w2_temp,home_w3_temp, home_w4_temp, constr_w4_temp)

##############
# Education Parents
##############

eduparent <- home_w1 %>%
  select(AID, H1NM4, H1RM1, H1NF4, H1RF1, PC1, PA12) %>%
  rename(aid = AID,
         edumother_nonres = H1NM4,
         edumother_res = H1RM1,
         edufather_nonres = H1NF4,
         edufather_res = H1RF1,
         eduparent = PA12,
         rel = PC1) %>%
  mutate(aid = as.character(aid),
         rel = case_when(rel == 1 ~ "mother",
                         rel == 9 ~ "father")) %>% 
  mutate_at(vars(edumother_nonres, edumother_res,
                 edufather_nonres, edufather_res, eduparent), ~ 
              case_when(. == 1 ~ 8, # eighth grade or less
                        . == 2 ~ 10, # more than eighth grade, but did not graduate from hs
                        . == 3 ~ 10, # went to a business, trade, or vocational school intead of hs
                        . == 4 ~ 12, # hs graduate
                        . == 5 ~ 12, # completed a GED
                        . == 6 ~ 14, # went to a busniess, trade, or vocational school after hs
                        . == 7 ~ 14, # went to college, but did not graduate
                        . == 8 ~ 16, # graduated from a college or university
                        . == 9 ~ 19, # professional training beyond a four-year college or university
                        . == 10 ~ 0,
                        . == 11 ~ 12, # she went to school, but I don't know what level
                        . == 12 ~ 12)) %>%
  mutate(edumother = case_when(!is.na(edumother_res) ~ edumother_res,
                               !is.na(edumother_nonres) ~ edumother_nonres,
                               rel == "mother" ~ eduparent),
         edufather = case_when(!is.na(edufather_res) ~ edufather_res,
                               !is.na(edufather_nonres) ~ edufather_nonres,
                               rel == "father" ~ eduparent)) %>%
  select(aid, edumother, edufather)

##############
# Race Parents
##############

raceparent <- home_w1 %>% 
  select(AID, PA1, PA4, PA6_1, PA6_2, PA8B, PB2, PB3, starts_with("PB5_")) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid),
         reth_parent = case_when(pa4==1 ~ 3, # hispanic
                                 pa6_2==1 & pa6_1==0 ~ 2, # black, non-hispanic
                                 pa6_1==1 & pa6_2==0 ~ 1, # white, non-hispanic
                                 pa6_2==1 & pa6_1==1 & pa8b==2 ~ 2, # black, non-hispanic
                                 pa6_2==1 & pa6_1==1 & pa8b==1 ~ 1), # white, non-hispanic
         reth_partner = case_when(pb3==1 ~ 3, # hispanic
                                  pb5_2==1 & pb5_1==0 ~ 2, # black, non-hispanic
                                  pb5_1==1 & pb5_2==0 ~ 1),# white, non-hispanic
         reth_mother = case_when(pa1 == 2 ~ reth_parent,
                                 pb2 == 2 ~ reth_partner),
         reth_father = case_when(pa1 == 1 ~ reth_parent,
                                 pb2 == 1 ~ reth_partner)) %>%
  select(aid, reth_mother, reth_father)

##############
# Age Parents
##############

home_w1_temp <- home_w1 %>% 
  select(AID, starts_with("H1HR3"), starts_with("H1HR6"), starts_with("H1HR7")) %>% 
  rename_all(tolower) %>%
  mutate(aid = as.character(aid)) %>% 
  mutate_at(vars(starts_with("h1hr3")), ~ replace(., !(. %in% c(11, 14)), NA)) %>%
  mutate_at(vars(starts_with("h1hr6")), ~ replace(., . == 97 | !(. %in% c(1, 7)), NA)) %>%
  mutate_at(vars(starts_with("h1hr7")), ~ replace(., . > 120, NA)) %>%
  pivot_longer(cols = starts_with("h1hr3"),
               names_to = "h1hr3",
               values_to = "relationship") %>%
  filter(complete.cases(relationship)) %>%
  pivot_longer(cols = starts_with("h1hr6"),
               names_to = "h1hr6",
               values_to = "biological") %>% 
  filter(complete.cases(biological)) %>% 
  pivot_longer(cols = starts_with("h1hr7"),
               names_to = "h1hr7",
               values_to = "age") %>% 
  filter(complete.cases(age)) %>%
  mutate_at(vars("h1hr3", "h1hr6", "h1hr7"), ~ substr(., 6, 6)) %>% 
  mutate(ageparent = ifelse(h1hr3 == h1hr6 & h1hr6 == h1hr7, age, NA)) %>%
  filter(complete.cases(ageparent)) %>%
  select(aid, relationship, ageparent) %>% 
  mutate(relationship = case_when(relationship == 14 ~ "mother",
                                  relationship == 11 ~ "father")) %>% 
  pivot_wider(names_from = relationship,
              names_glue = "age_{relationship}_w1",
              values_from = ageparent,
              values_fn = max)

home_w2_temp <- home_w2 %>% 
  select(AID, starts_with("H2HR4"), starts_with("H2HR7"), starts_with("H2HR9")) %>% 
  rename_all(tolower) %>%
  mutate(aid = as.character(aid)) %>% 
  mutate_at(vars(starts_with("h2hr4")), ~ replace(., !(. %in% c(11, 14)), NA)) %>%
  mutate_at(vars(starts_with("h2hr7")), ~ replace(., . == 97 | !(. %in% c(1, 7)), NA)) %>%
  mutate_at(vars(starts_with("h2hr9")), ~ replace(., . > 120, NA)) %>%
  pivot_longer(cols = starts_with("h2hr4"),
               names_to = "h2hr4",
               values_to = "relationship") %>%
  filter(complete.cases(relationship)) %>%
  pivot_longer(cols = starts_with("h2hr7"),
               names_to = "h2hr7",
               values_to = "biological") %>% 
  filter(complete.cases(biological)) %>% 
  pivot_longer(cols = starts_with("h2hr9"),
               names_to = "h2hr9",
               values_to = "age") %>% 
  filter(complete.cases(age)) %>%
  mutate_at(vars("h2hr4", "h2hr7", "h2hr9"), ~ substr(., 6, 6)) %>% 
  mutate(ageparent = ifelse(h2hr4 == h2hr7 & h2hr7 == h2hr9, age, NA)) %>%
  filter(complete.cases(ageparent)) %>%
  select(aid, relationship, ageparent) %>% 
  mutate(relationship = case_when(relationship == 14 ~ "mother",
                                  relationship == 11 ~ "father")) %>% 
  pivot_wider(names_from = relationship,
              names_glue = "age_{relationship}_w2",
              values_from = ageparent,
              values_fn = max)

ageparent <- home_w1_temp %>% 
  full_join(home_w2_temp, by = "aid")
rm(home_w1_temp, home_w2_temp)

##############
# Parental Investments
##############

parinvest <- home_w1 %>% 
  select(AID, starts_with("H1WP17"), starts_with("H1WP18"), H1WP1, H1WP2, H1WP3,H1WP4, H1WP5, H1WP6, H1WP7, H1WP8) %>%
  select(-H1WP17K, -H1WP18K) %>% 
  rename_all(tolower) %>%
  mutate(aid = as.character(aid)) %>% 
  mutate_at(vars(starts_with("h1wp17"), starts_with("h1wp18"), h1wp1, h1wp2, h1wp3, h1wp4, h1wp5, h1wp6, h1wp7), ~ replace(., . %in% 6:9, NA)) %>% 
  mutate_at(vars(h1wp8), ~ replace(., . %in% 96:98, NA)) %>% 
  mutate_at(vars(starts_with("h1wp")), ~ scale(., center = TRUE, scale = TRUE)) %>% 
  rowwise() %>%
  mutate(auth1    =-h1wp2,
         auth2    = ifelse(all(is.na(c(h1wp1, h1wp2, h1wp3, h1wp6))), NA,
                           sum(-h1wp1, -h1wp2, -h1wp3, -h1wp6, na.rm = TRUE)),
         auth3    = ifelse(all(is.na(c(h1wp1, h1wp2, h1wp3, h1wp4, h1wp5, h1wp6, h1wp7))), NA,
                           sum(-h1wp1, -h1wp2, -h1wp3, -h1wp4, -h1wp5, -h1wp6, -h1wp7, na.rm = TRUE)),
         inv_moth = ifelse(all(is.na(c(h1wp17a,h1wp17b,h1wp17c,h1wp17d,h1wp17e,h1wp17f,h1wp17h,h1wp17i,h1wp17j))), NA,
                           sum(h1wp17a,h1wp17b,h1wp17c,h1wp17d,h1wp17e,h1wp17f,h1wp17h,h1wp17i,h1wp17j, na.rm = TRUE)),
         inv_fath = ifelse(all(is.na(c(h1wp18a,h1wp18b,h1wp18c,h1wp18d,h1wp18e,h1wp18f,h1wp18h,h1wp18i,h1wp18j))), NA,
                           sum(h1wp17e,h1wp18a,h1wp18b,h1wp18c,h1wp18d,h1wp18e,h1wp18f,h1wp18h,h1wp18i,h1wp18j, na.rm = TRUE)),
         inv      = ifelse(all(is.na(c(h1wp17a,h1wp17b,h1wp17c,h1wp17d,h1wp17e,h1wp17f,h1wp17h,h1wp17i,h1wp17j,h1wp18a,h1wp18b,h1wp18c,h1wp18d,h1wp18e,h1wp18f,h1wp18h,h1wp18i,h1wp18j))), NA, 
                           sum(h1wp17a,h1wp17b,h1wp17c,h1wp17d,h1wp17e,h1wp17f,h1wp17h,h1wp17i,h1wp17j,h1wp18a,h1wp18b,h1wp18c,h1wp18d,h1wp18e,h1wp18f,h1wp18h,h1wp18i,h1wp18j, na.rm = TRUE))) %>%
  ungroup() %>%
  select(aid, auth1, auth2, auth3, inv_moth, inv_fath, inv)

##############
# Siblings
##############

sibs_temp0 <- sibling3 %>% 
  rename_all(tolower) %>% 
  select(contains("aid")) %>% 
  rename(aid1 = aid,
         aid2 = sib_aid1,
         aid3 = sib_aid2,
         aid4 = sib_aid3,
         aid5 = sib_aid4) %>% 
  mutate_all(~ as.character(.)) %>% 
  mutate_all(~ replace(., . == "", NA)) %>% 
  mutate_all(~ as.numeric(.)) %>%
  mutate(famid = rowSums(., na.rm = TRUE)) %>%
  mutate_all(~ as.character(.))

sibs_temp1 <- sibs_temp0 %>% select(famid, aid1) %>% rename(aid = aid1) %>% filter(complete.cases(.))
sibs_temp2 <- sibs_temp0 %>% select(famid, aid2) %>% rename(aid = aid2) %>% filter(complete.cases(.))
sibs_temp3 <- sibs_temp0 %>% select(famid, aid3) %>% rename(aid = aid3) %>% filter(complete.cases(.))
sibs_temp4 <- sibs_temp0 %>% select(famid, aid4) %>% rename(aid = aid4) %>% filter(complete.cases(.))
sibs_temp5 <- sibs_temp0 %>% select(famid, aid5) %>% rename(aid = aid5) %>% filter(complete.cases(.))

sibs_temp <- rbind(sibs_temp1, sibs_temp2, sibs_temp3, sibs_temp4, sibs_temp5) %>%
  mutate(famid = replace(famid, aid == "91316754", "123950262"),
         famid = replace(famid, aid %in% c("93578243", "90718947", "95578241"), "279875431"),
         famid = replace(famid, aid %in% c("94579298", "94579295", "94579296"), "283737889"),
         famid = replace(famid, aid %in% c("97572230", "91582938", "91712236"), "280867404"),
         famid = replace(famid, aid %in% c("98570417", "91880912", "99540116", "96540114"), "386531559"),
         famid = replace(famid, aid %in% c("91316754", "21316754", "11316754"), "123950262"),
         famid = replace(famid, aid %in% c("93573223", "91883921", "91583928"), "277041072"),
         famid = replace(famid, aid %in% c("98718344", "92578848", "92578846", "91888948"), "375764986"),
         famid = replace(famid, aid %in% c("94606921", "94606920", "94606923"), "283820764"),
         famid = replace(famid, aid %in% c("97504304", "96884904", "99884906", "99884908"), "394159022")) %>% 
  unique() %>% 
  arrange(famid) %>% 
  rename(famid_orig = famid)

famid_temp <- sibs_temp %>% select(famid_orig) %>% unique() %>% mutate(famid = as.character(1:n()))

sibs <- sibs_temp %>% 
  left_join(famid_temp, by = "famid_orig") %>%
  select(famid, aid) %>% 
  group_by(famid) %>% 
  mutate(famsize = n()) %>% 
  ungroup()

rm(sibs_temp0, sibs_temp1, sibs_temp2, sibs_temp3, sibs_temp4, sibs_temp5, sibs_temp, famid_temp)

##############
# Contextual variables
##############

context <- context1 %>%
  select(AID, BST90010, BST90005, BST90544, BST90550, BST90591, CAR90424, BST90754) %>%
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

##############
# Regions of residence
##############

census_regions <- w5group %>% 
  rename_all(tolower) %>%
  mutate_all(~ as.character(.)) %>% 
  mutate_at(vars(starts_with("w5group")), ~ replace(., . %in% c("999999999992",
                                                                "999999999999"), NA)) %>%
  full_join(w3group %>% 
              rename_all(tolower) %>%
              mutate_all(~ as.character(.)), by = "aid") %>% 
  full_join(w4group %>% 
              rename_all(tolower) %>%
              mutate_all(~ as.character(.)), by = "aid") %>% 
  full_join(w3region %>% 
              rename_all(tolower) %>%
              mutate_all(~ as.character(.)), by = "aid") %>% 
  full_join(w4region %>% 
              rename_all(tolower) %>%
              mutate_all(~ as.character(.)), by = "aid") %>%
  mutate(w1state = substr(w5group1, 1, 2),
         w2state = substr(w5group2, 1, 2),
         w3state = substr(w5group3, 1, 2),
         w4state = substr(w5group4, 1, 2),
         w5state = substr(w5group5, 1, 2),
         w1region = NA,
         w2region = NA,
         w5region = NA,
         w3region = replace(w3region, w3state == "39" & w3region == "2", "1"),
         w4region = replace(w4region, w4state == "1"  & w4region == "2", "3"),
         w4region = replace(w4region, w4state == "96" & w4region == "2", "3"),
         w4region = replace(w4region, w4state == "46" & w4region == "1", "2")) %>%
  select(aid, w1region, w1state, w2region, w2state, w3region, w3state, 
         w4region, w4state, w5region, w5state) %>%
  pivot_longer(cols = -aid,
               names_pattern = "w(.)(.*)",
               names_to = c("wave", ".value"),
               values_to = "value") %>%
  mutate_at(vars(-aid), ~ as.numeric(.)) %>% 
  group_by(state) %>% 
  mutate(region = ifelse(!is.na(state), round(mean(region, na.rm = TRUE), 0), region)) %>% 
  ungroup %>%
  pivot_wider(id_cols = aid,
              names_from = wave,
              names_glue = "w{wave}{.value}",
              values_from = c(region, state)) %>%
  arrange(aid, w1region, w1state, w2region, w2state, w3region, w3state, 
          w4region, w4state, w5region, w5state)

##############
# PGIs
##############
pgs <- gen_pgs %>%
  mutate(AID = as.character(AID)) %>%
  rename_with(~paste0("PGI1_", substring(., 3)), starts_with("PS")) %>%
  rename(ancestry = PGI1_ANCEST,
         race     = PGI1_RACE,
         pgs_edu=PGI1_EDU1,
         pgs_bmi=PGI1_BMI1, 
         pgs_adh=PGI1_ADHD2,
         pgs_dep=PGI1_DEP2, 
         pgs_int=PGI1_INTELL,
         pgs_smk=PGI1_EVRSK1, 
         pgs_slp=PGI1_SLEEPD,
         pc1=PGI1_PC1,
         pc2=PGI1_PC2,
         pc3=PGI1_PC3,
         pc4=PGI1_PC4,
         pc5=PGI1_PC5,
         pc6=PGI1_PC6,
         pc7=PGI1_PC7,
         pc8=PGI1_PC8,
         pc9=PGI1_PC9,
         pc10=PGI1_PC10,
         pc11=PGI1_PC11,
         pc12=PGI1_PC12,
         pc13=PGI1_PC13,
         pc14=PGI1_PC14,
         pc15=PGI1_PC15,
         pc16=PGI1_PC16,
         pc17=PGI1_PC17,
         pc18=PGI1_PC18,
         pc19=PGI1_PC19,
         pc20=PGI1_PC20) %>%
  rename_all(tolower) 

##############
# PVT
##############

pvt <- ahpvt_w3 %>% 
  select(AID, PVTPCT1C, PVTPCT3) %>% 
  rename_all(tolower) %>%
  rename(pvt_lag = pvtpct1c,
         pvt = pvtpct3) %>% 
  mutate(aid = as.character(aid),
         pvt_lag = replace(pvt_lag, pvt_lag > 100, NA),
         pvt = replace(pvt, pvt > 100, NA))

##############
# Educational degrees
##############

home_w1_temp <- home_w1 %>% 
  select(AID, H1GI1M, H1GI1Y) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid),#
         h1gi1m = replace(h1gi1m, h1gi1m == 96, NA),
         h1gi1y = replace(h1gi1y, h1gi1y == 96, NA),
         h1gi1y = h1gi1y + 1900)

home_w3_temp <- home_w3 %>% 
  select(AID, H3ED1, H3ED2, H3ED3, H3ED4, H3ED5, H3ED6, 
         H3ED7, H3ED8, H3ED12Y, H3ED13Y, H3ED14Y, H3ED15Y, 
         H3ED18Y, H3ED21Y, H3OD1M, H3OD1Y, IMONTH3, IYEAR3) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w4_temp <- home_w4 %>% 
  select(AID, H4ED1, H4ED2, H4ED3A:H4ED3E, H4ED4A:H4ED4E, IMONTH4, IYEAR4) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w5_temp <- home_w5 %>% 
  select(AID, H5OD11, IMONTH5, IYEAR5) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

degree_self <- home_w1_temp %>% 
  full_join(home_w3_temp, by = "aid") %>% 
  full_join(home_w4_temp, by = "aid") %>% 
  full_join(home_w5_temp, by = "aid") %>% 
  mutate(hs3 = case_when(h3ed2==1 | h3ed3==1 ~ 1,
                         !(h3ed1 %in% c(96, 98, 99)) ~ 0),
         hs3_year = case_when(hs3==1 & h3ed13y<9990 ~ h3ed13y,
                              hs3==1 & h3ed12y<9990 ~ h3ed12y),
         coll2y3 = case_when(h3ed4==1 | h3ed5==1 | h3ed6==1 | h3ed7==1 | h3ed8==1 ~ 1,
                             !(h3ed1 %in% c(96, 98, 99)) ~ 0),
         coll2y3_year = case_when(coll2y3==1 & h3ed14y<9990 ~ h3ed14y),
         coll4y3 = case_when(h3ed5==1 | h3ed6==1 | h3ed7==1 | h3ed8==1 ~ 1,
                             !(h3ed1 %in% c(96, 98, 99)) ~ 0),
         coll4y3_year = case_when(coll4y3==1 & h3ed15y<9990 ~ h3ed15y),
         eduyears3 = case_when(h3ed1<96 & h3ed1>=8 ~ h3ed1,
                               h3ed1<8 ~ 8),
         age3 = ifelse(imonth3<h1gi1m, iyear3-h1gi1y-1, iyear3-h1gi1y),
         postgrad3 = case_when(h3ed6==1 | h3ed7==1 | h3ed8==1 ~ 1,
                               !(h3ed1 %in% c(96, 98, 99)) ~ 0),
         postgrad3_year = case_when(postgrad3==1 & h3ed21y<9990 ~ h3ed21y,
                                    postgrad3==1 & h3ed18y<9990 ~ h3ed18y)) %>% 
  mutate(hs3 = replace(hs3, hs3==1 & h3ed1<10, NA),
         hs3_year = replace(hs3_year, is.na(hs3), NA),
         coll2y3 = replace(coll2y3, (coll2y3==1 & h3ed1<14), NA),
         coll2y3_year = replace(coll2y3_year, is.na(coll2y3), NA),
         coll4y3 = replace(coll4y3,(coll4y3==1 & h3ed1<16), NA),
         coll4y3_year = replace(coll4y3_year, is.na(coll4y3), NA),
         postgrad3 = replace(postgrad3, (postgrad3==1 & h3ed1<18), NA),
         postgrad3_year = replace(postgrad3_year, is.na(postgrad3), NA)) %>% 
  mutate(hs4 = case_when(h4ed1==1 | h4ed1==2 ~ 1,
                         !(h4ed2 %in% c(96, 98)) ~ 0),
         hs4_year = NA,
         coll2y4 = case_when((h4ed3a>=3 & h4ed3a<=8) | (h4ed3b>=3 & h4ed3b<=8) | (h4ed3c>=3 & h4ed3c<=8) | (h4ed3d>=3 & h4ed3d<=6) | (h4ed3e>=3 & h4ed3e<=3) ~ 1,
                             !(h4ed2 %in% c(96, 98)) ~ 0),
         coll2y4_year = case_when(coll2y4==1 & h4ed3e==3 ~ h4ed4e,
                                  coll2y4==1 & h4ed3d==3 ~ h4ed4d,
                                  coll2y4==1 & h4ed3c==3 ~ h4ed4c,
                                  coll2y4==1 & h4ed3b==3 ~ h4ed4b,
                                  coll2y4==1 & h4ed3a==3 ~ h4ed4a),
         coll4y4 = case_when((h4ed3a>=4 & h4ed3a<=8) | (h4ed3b>=4 & h4ed3b<=8) | (h4ed3c>=4 & h4ed3c<=8) | (h4ed3d>=4 & h4ed3d<=6) ~ 1,
                             !(h4ed2 %in% c(96, 98)) ~ 0),
         coll4y4_year = case_when(coll4y4==1 & h4ed3e==4 ~ h4ed4e,
                                  coll4y4==1 & h4ed3d==4 ~ h4ed4d,
                                  coll4y4==1 & h4ed3c==4 ~ h4ed4c,
                                  coll4y4==1 & h4ed3b==4 ~ h4ed4b,
                                  coll4y4==1 & h4ed3a==4 ~ h4ed4a),
         postgrad4 = case_when((h4ed3a>=6 & h4ed3a<=8) | (h4ed3b>=6 & h4ed3b<=8) | (h4ed3c>=6 & h4ed3c<=8) | (h4ed3d>=6 & h4ed3d<=6) ~ 1,
                               !(h4ed2 %in% c(96, 98)) ~ 0),
         postgrad4_year = case_when(postgrad4==1 & h4ed3d==6  ~ h4ed4d,
                                    postgrad4==1 & h4ed3c>=5 & h4ed3c<=8 ~ h4ed4c,
                                    postgrad4==1 & h4ed3b>=5 & h4ed3b<=8 ~ h4ed4b,
                                    postgrad4==1 & h4ed3a>=5 & h4ed3a<=8 ~ h4ed4a),
         eduyears4 = case_when(h4ed2==1 ~ 8,    # 8th grade or less
                               h4ed2==2 ~ 10,   # some hs
                               h4ed2==3 ~ 12,   # hs graduate
                               h4ed2==4 ~ 13,   # some vocational/technical training (after hs)
                               h4ed2==5 ~ 14,   # completed vocational/technical training (after hs)
                               h4ed2==6 ~ 14,   # some college
                               h4ed2==7 ~ 16,   # completed college (bachelor's degree)
                               h4ed2==8 ~ 17,   # some graduate school
                               h4ed2==9 ~ 18,   # completed a master's degree
                               h4ed2==10 ~ 19,  # some graduate training beyond a master's degree
                               h4ed2==11 ~ 20,  # completed a doctoral degree
                               h4ed2==12 ~ 18,  # some post baccalaureate professional education
                               h4ed2==13 ~ 19), # completed post bacc. prof. education
         age4 = ifelse(imonth4<h1gi1m, iyear4-h1gi1y-1, iyear4-h1gi1y)) %>% 
  mutate(hs4 = replace(hs4, hs4==1 & h4ed2<2, NA),
         hs4_year = replace(hs4_year, is.na(hs4), NA),
         coll2y4 = replace(coll2y4, (coll2y4==1 & h4ed2<=4), NA),
         coll2y4_year = replace(coll2y4_year, is.na(coll2y4), NA),
         coll4y4 = replace(coll4y4, (coll4y4==1 & h4ed2<=6), NA),
         coll4y4_year = replace(coll4y4_year, is.na(coll4y4), NA),
         postgrad4 = replace(postgrad4, (postgrad4==1 & h4ed2<=8), NA),
         postgrad4_year = replace(postgrad4_year, is.na(postgrad4), NA)) %>%
  mutate(hs5 = case_when(h5od11>2 & h5od11<17 ~ 1,
                         !(is.na(h5od11)) ~ 0),
         coll2y5 = case_when(h5od11>7 & h5od11<17 & h5od11!=9 ~ 1,
                             !(is.na(h5od11)) ~ 0),
         coll4y5 = case_when(h5od11>9 & h5od11<17 ~ 1,
                             !(is.na(h5od11)) ~ 0),
         postgrad5 = case_when(h5od11>11 & h5od11<17 & h5od11!=15 ~ 1,
                               !(is.na(h5od11)) ~ 0),
         eduyears5 = case_when(h5od11==1 ~ 8,
                               h5od11==2 ~ 10,
                               h5od11==3 ~ 12,
                               h5od11==4 ~ 12,
                               h5od11==5 ~ 13,
                               h5od11==6 ~ 14,
                               h5od11==7 ~ 14,
                               h5od11==8 ~ 14,
                               h5od11==9 ~ 14,
                               h5od11==10 ~ 16,
                               h5od11==11 ~ 17,
                               h5od11==12 ~ 18,
                               h5od11==13 ~ 19,
                               h5od11==14 ~ 20,
                               h5od11==15 ~ 18,
                               h5od11==16 ~ 19),
         age5 = ifelse(imonth5<h1gi1m,iyear5-h1gi1y-1, iyear5-h1gi1y)) %>% 
  mutate(hs = case_when(!is.na(hs5) ~ hs5,
                        is.na(hs5) & age4>=27 ~ hs4,
                        is.na(hs5) & age3>=27 ~ hs3), 
         coll2y = case_when(!is.na(coll2y5) ~ coll2y5,
                            is.na(coll2y5) & age4>=27 ~ coll2y4,
                            is.na(coll2y5) & age3>=27 ~ coll2y3), 
         coll4y = case_when(!is.na(coll4y5) ~ coll4y5,
                            is.na(coll4y5) & age4>=27 ~ coll4y4,
                            is.na(coll4y5) & age3>=27 ~ coll4y3), 
         postgrad = case_when(!is.na(postgrad5) ~ postgrad5,
                              is.na(postgrad5) & age4>=27 ~ postgrad4,
                              is.na(postgrad5) & age3>=27 ~ postgrad3), 
         eduyears = case_when(!is.na(eduyears5) ~ eduyears5,
                              is.na(eduyears5) & age4>=27 ~ eduyears4,
                              is.na(eduyears5) & age3>=27 ~ eduyears3),
         hs_year = pmin(hs3_year, hs4_year, na.rm = TRUE), 
         coll2y_year = pmin(coll2y3_year, coll2y4_year, na.rm = TRUE), 
         coll4y_year = pmin(coll4y3_year, coll4y4_year, na.rm = TRUE), 
         postgrad_year = pmin(postgrad3_year, postgrad4_year, na.rm = TRUE)) %>% 
  mutate(hs = replace(hs, hs!= 1 & (coll2y==1 | coll4y==1 | postgrad==1), 1),
         coll2y = replace(coll2y, coll2y!=1 & (coll4y==1 | postgrad==1), 1),
         coll4y = replace(coll4y, coll4y!=1 & (postgrad==1), 1)) %>% 
  select(aid, hs, hs_year, coll2y, coll2y_year, coll4y, coll4y_year, 
         postgrad, postgrad_year, eduyears)

rm(home_w1_temp, home_w3_temp, home_w4_temp, home_w5_temp)

##############
# Socioemotional skills
##############

home_w1_temp <- home_w1 %>% 
  select(AID, H1PF16) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w2_temp <- home_w2 %>% 
  select(AID, H2PF15, H2PF28, H2PF35) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w3_temp <- home_w3 %>% 
  select(AID, H3SP18, H3SP23, H3SP24) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))

home_w4_temp <- home_w4 %>% 
  select(AID, H4PE1, H4PE2, H4PE3, H4PE4, H4PE5, H4PE9, H4PE10, 
         H4PE11, H4PE12, H4PE13, H4PE17, H4PE18, H4PE19, H4PE20, 
         H4PE21, H4PE25, H4PE26, H4PE27, H4PE28, H4PE29, H4PE34, 
         H4PE35, H4PE36) %>% 
  rename_all(tolower) %>% 
  mutate(aid = as.character(aid))


socioemo <- home_w1_temp %>% 
  full_join(home_w2_temp, by = "aid") %>% 
  full_join(home_w3_temp, by = "aid") %>% 
  full_join(home_w4_temp, by = "aid") %>%
  mutate_at(vars(-aid), ~ replace(., . %in% c(6, 8, 9, 96, 98, 99), NA)) %>%
  mutate_at(vars(h4pe1, h4pe17, h4pe2,
                 h4pe18, h4pe3, h4pe19,
                 h4pe4, h4pe20, h4pe5, h1pf16),
            ~ max(., na.rm = TRUE) + min(., na.rm = TRUE) - .) %>%
  rowwise() %>%
  mutate(extra = mean(c(h4pe1, h4pe17, h4pe9, h4pe25)),
         agree = mean(c(h4pe2, h4pe18, h4pe10, h4pe26)),
         consc = mean(c(h4pe3, h4pe19, h4pe11, h4pe27)),
         neuro = mean(c(h4pe4, h4pe20, h4pe12, h4pe28)),
         intel = mean(c(h4pe5, h4pe21, h4pe13, h4pe29))) %>%
  ungroup() %>%
  mutate(risk2 = h2pf28,
         risk3 = h3sp23,
         risk4 = h4pe35,
         risk = (risk3+risk4)/2,
         pat2 = h2pf35,
         pat3 = h3sp24,
         pat4 = h4pe36,
         pat = (pat3+pat4)/2,
         struc2 = h2pf15,
         struc3 = h3sp18,
         struc4 = h4pe34) %>%
  select(aid, extra, agree, consc, neuro, intel,
         risk2, risk3, risk4, risk,
         pat2, pat3, pat4, pat,
         struc2, struc3, struc4)

rm(home_w1_temp, home_w2_temp, home_w3_temp, home_w4_temp)

##############
# Other control variables
##############

controls <- home_w1 %>% 
  select(AID, H1RM2, H1RF2, H1NM6, H1NF6,
         H1GI3, H1GI11, PC20, H1HR14, H1HR15) %>%
  rename(bornus_resmom = H1RM2,
         bornus_resdad = H1RF2,
         bornus_biomom = H1NM6,
         bornus_biodad = H1NF6,
         bornus = H1GI11,
         breastfed = PC20) %>%
  mutate_at(vars(starts_with("bornus_")), 
            ~ case_when(. == 1 ~ 0,
                        . == 0 ~ 1)) %>%
  mutate(aid = as.character(AID),
         bornus = case_when(bornus == 1 ~ 0,
                            H1GI3 == 0 ~ 0,
                            bornus == 0 ~ 1),
         bornus_mother = case_when(!is.na(bornus_resmom) ~ bornus_resmom,
                                   !is.na(bornus_biomom) ~ bornus_biomom),
         bornus_father = case_when(!is.na(bornus_resdad) ~ bornus_resdad,
                                   !is.na(bornus_biodad) ~ bornus_biodad),
         breastfed = case_when(breastfed %in% 1:6 ~ 1,
                               breastfed == 7 ~ 0),
         firstborn = case_when(H1HR14 == 1 ~ 1,
                               H1HR15 == 1 ~ 1,
                               H1HR15 %in% 2:15 ~ 0)) %>%
  select(aid, bornus, bornus_mother, bornus_father, breastfed, firstborn)

##############
# Income
##############

inc_w5 <- home_w5 %>% 
  select(AID, H5EC1, H5EC2) %>% 
  rename(pincw5 = H5EC1) %>% 
  rename(fincw5 = H5EC2) %>% 
  mutate(aid   = as.character(AID),
         pincw5 = case_when(pincw5 == 1 ~ 2500,
                            pincw5 == 2 ~ 7500,
                            pincw5 == 3 ~ 12500,
                            pincw5 == 4 ~ 17500,
                            pincw5 == 5 ~ 22500,
                            pincw5 == 6 ~ 27500,
                            pincw5 == 7 ~ 35000,
                            pincw5 == 8 ~ 45000,
                            pincw5 == 9 ~ 57500,
                            pincw5 == 10 ~ 87500,
                            pincw5 == 11 ~ 125000,
                            pincw5 == 12 ~ 175000,
                            pincw5 == 13 ~ 500000),
         fincw5 = case_when(fincw5 == 1 ~ 2500,
                            fincw5 == 2 ~ 7500,
                            fincw5 == 3 ~ 12500,
                            fincw5 == 4 ~ 17500,
                            fincw5 == 5 ~ 22500,
                            fincw5 == 6 ~ 27500,
                            fincw5 == 7 ~ 35000,
                            fincw5 == 8 ~ 45000,
                            fincw5 == 9 ~ 57500,
                            fincw5 == 10 ~ 87500,
                            fincw5 == 11 ~ 125000,
                            fincw5 == 12 ~ 175000,
                            fincw5 == 13 ~ 500000)) %>% 
  select(aid, pincw5, fincw5)

##############
# Age
##############

age_home_w1 <- home_w1 %>%
  select(AID, H1GI1M, H1GI1Y, IMONTH, IYEAR) %>%
  rename(aid = AID,
         mbirth1 = H1GI1M,
         ybirth1 = H1GI1Y) %>%
  mutate(aid = as.character(aid),
         mbirth1 = replace(mbirth1, mbirth1 == 96, NA),
         ybirth1 = replace(ybirth1, ybirth1 == 96, NA),
         ybirth1 = ybirth1 + 1900,
         IYEAR = IYEAR + 1900)

age_home_w2 <- home_w2 %>%
  select(AID, H2GI1M, H2GI1Y, IMONTH2, IYEAR2) %>%
  rename(aid = AID,
         mbirth2 = H2GI1M,
         ybirth2 = H2GI1Y) %>%
  mutate(aid = as.character(aid),
         mbirth2 = replace(mbirth2, mbirth2 == 98, NA),
         ybirth2 = replace(ybirth2, ybirth2 == 98, NA),
         ybirth2 = ybirth2 + 1900,
         IYEAR2 = IYEAR2 + 1900)

age_home_w3 <- home_w3 %>%
  select(AID, H3OD1M, H3OD1Y, IMONTH3, IYEAR3) %>%
  rename(aid = AID,
         mbirth3 = H3OD1M,
         ybirth3 = H3OD1Y) %>%
  mutate(aid = as.character(aid))

age_home_w4 <- home_w4 %>%
  select(AID, H4OD1M, H4OD1Y, IMONTH4, IYEAR4) %>%
  rename(aid = AID,
         mbirth4 = H4OD1M,
         ybirth4 = H4OD1Y) %>%
  mutate(aid = as.character(aid))

age <- age_home_w1 %>%
  full_join(age_home_w2, by = "aid") %>%
  full_join(age_home_w3, by = "aid") %>%
  full_join(age_home_w4, by = "aid") %>%
  mutate(birth1 = ybirth1*12 + mbirth1,
         birth2 = ybirth2*12 + mbirth2,
         birth3 = ybirth3*12 + mbirth3,
         birth4 = ybirth4*12 + mbirth4,
         date_home_w1 = IYEAR*12 + IMONTH,
         date_home_w2 = IYEAR2*12 + IMONTH2,
         date_home_w3 = IYEAR3*12 + IMONTH3,
         date_home_w4 = IYEAR4*12 + IMONTH4) %>%
  mutate(birth = round(rowMeans(cbind(birth1, birth2, birth3, birth4), na.rm = TRUE), 0),
         age_home_w1 = date_home_w1 - birth,
         age_home_w2 = date_home_w2 - birth,
         age_home_w3 = date_home_w3 - birth,
         age_home_w4 = date_home_w4 - birth,
         ybirth = factor(round(rowMeans(cbind(ybirth1, ybirth2, ybirth3, ybirth4), na.rm = TRUE), 0),
                         levels = c(1974:1983),
                         labels = c(1974:1983))) %>%
  select(aid, ybirth, age_home_w1, age_home_w2, age_home_w3, age_home_w4)

rm(age_home_w1, age_home_w2, age_home_w3, age_home_w4)


##############
# Bartik shocks
##############

bartik <- read_dta("temp/bartik.dta")

add_columns <- function(df, column){
  new <- as.numeric(substr(column, 8, 11))
  names(new) <- column
  mutate(df, !!!new)
}

bartik_wage <- bartik %>% 
  arrange(sex, educ, reth, cregion, year) %>% 
  add_columns(column = paste0("ybirth_", 1974:1986)) %>% 
  mutate_at(vars(starts_with("ybirth_")), ~ ifelse(year >= . & year < .+14, 1, 0)) %>%
  mutate(ybirth_1974 = ifelse(year >= 1974 & year < 1974+14, 1, 0),
         ybirth_1975 = ifelse(year >= 1975 & year < 1975+14, 1, 0),
         ybirth_1976 = ifelse(year >= 1976 & year < 1976+14, 1, 0)) %>%
  pivot_longer(cols = starts_with("ybirth"),
               names_to = "ybirth",
               names_pattern = "ybirth_(.*)",
               values_to = "ind") %>%
  filter(ind == 1) %>%
  select(-year, -ind) %>%
  group_by(ybirth, sex, educ, reth, cregion) %>% 
  summarise(b_wage_mean = mean(b_wage),
            b_wage_sd = sd(b_wage), 
            .groups = "drop") %>%
  rename(gender = sex)

##############
# Weights
##############

weights1 <- w1homewc %>% 
  select(AID, W1_WC) %>%
  rename_all(tolower) %>% 
  rename(w1 = w1_wc) %>% 
  mutate(aid = as.character(aid))

weights2 <- w2homewc %>% 
  select(AID, W2_WC) %>% 
  rename_all(tolower) %>% 
  rename(w2 = w2_wc) %>% 
  mutate(aid = as.character(aid))

weights3 <- w3homewc %>% 
  select(AID, W3_2_WC) %>% 
  rename_all(tolower) %>% 
  rename(w3 = w3_2_wc) %>% 
  mutate(aid = as.character(aid))

weights4 <- w4homewc %>% 
  select(AID, W4_2_WC) %>% 
  rename_all(tolower) %>% 
  rename(w4 = w4_2_wc) %>% 
  mutate(aid = as.character(aid))

weights5 <- weights5 %>% 
  select(AID, GSW5) %>% 
  rename_all(tolower) %>%
  rename(w5 = gsw5) %>% 
  mutate(aid = as.character(aid))

weights <- weights1 %>% 
  full_join(weights2, by = "aid") %>% 
  full_join(weights3, by = "aid") %>% 
  full_join(weights4, by = "aid") %>% 
  full_join(weights5, by = "aid") 
weights$w0 <- 1

rm(weights1, weights2, weights3, weights4, weights5)


##############
# Merge data
##############
data_hcf <- home1 %>% 
  full_join(race, by = "aid") %>% 
  full_join(ses, by = "aid") %>% 
  full_join(health, by = "aid") %>% 
  full_join(eduparent, by = "aid") %>% 
  full_join(raceparent, by = "aid") %>%
  full_join(ageparent, by = "aid") %>%
  full_join(parinvest, by = "aid") %>%
  full_join(sibs, by = "aid") %>% 
  full_join(context, by = "aid") %>% 
  full_join(census_regions, by = "aid") %>% 
  full_join(pgs, by = "aid") %>% 
  full_join(pvt, by = "aid") %>% 
  full_join(degree_self, by = "aid") %>% 
  full_join(socioemo, by = "aid") %>%
  full_join(controls, by = "aid") %>% 
  full_join(inc_w5, by = "aid") %>% 
  full_join(age, by = "aid") %>% 
  # full_join(placebo, by = "aid") %>% 
  mutate(gender_mother = 2,
         gender_father = 1,
         educ_mother = case_when(edumother < 12 ~ 1,  # less than hs
                                 edumother == 12 ~ 2,      # hs
                                 edumother > 12 ~ 3),      # more than hs
         educ_father = case_when(edufather < 12 ~ 1,  # less than hs
                                 edufather == 12 ~ 2, # hs
                                 edufather > 12 ~ 3), # more than hs
         reth_mother = ifelse(is.na(reth_mother), reth_aid, reth_mother),
         reth_father = ifelse(is.na(reth_father), reth_aid, reth_father),
         age_mother_atbirth_w1 = age_mother_w1 - age_home_w1/12,
         age_mother_atbirth_w2 = age_mother_w2 - age_home_w2/12) %>% 
  mutate(age_mother_atbirth_w1 = replace(age_mother_atbirth_w1, age_mother_atbirth_w1 < 16, NA),
         age_mother_atbirth_w2 = replace(age_mother_atbirth_w2, age_mother_atbirth_w2 < 16, NA)) %>% 
  mutate(agemum = case_when(!is.na(age_mother_atbirth_w1) ~ age_mother_atbirth_w1,
                            is.na(age_mother_atbirth_w1) ~ age_mother_atbirth_w2)) %>% 
  left_join(bartik_wage, by = c("ybirth", 
                                "gender_mother" = "gender", 
                                "educ_mother" = "educ", 
                                "reth_mother" = "reth",
                                "w1region" = "cregion")) %>%
  rename(bartik_mean_mother = b_wage_mean,
         bartik_sd_mother = b_wage_sd) %>% 
  left_join(bartik_wage, by = c("ybirth", 
                                "gender_father" = "gender", 
                                "educ_father" = "educ", 
                                "reth_father" = "reth",
                                "w1region" = "cregion")) %>%
  rename(bartik_mean_father = b_wage_mean,
         bartik_sd_father = b_wage_sd) %>% 
  left_join(weights, by = "aid") %>% 
  mutate(scid = scid <- as.numeric(scid))  %>% 
  select(-gender_mother, -gender_father, -educ_mother, -educ_father, 
         -reth_aid,
         -age_mother_atbirth_w1, -age_mother_atbirth_w2,
         -age_mother_w1, -age_mother_w2,
         -age_father_w1, -age_father_w2, -w1, -w2, -w3, -w4, -w5)

data_hcf <- subset(data_hcf, aid != "")

write_dta(data_hcf, path = "temp/data_individual.dta")


rm(list = ls())
